Stock Market Prediction

In [1]:
import math,random
import quandl
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,SGDRegressor,BayesianRidge,ARDRegression,PassiveAggressiveRegressor,TheilSenRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor,RandomForestRegressor,StackingRegressor,VotingRegressor
from sklearn.neural_network import MLPRegressor
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
In [2]:
# Ticker symbol used for the single-stock walkthrough further down.
stock = 'MSFT'
# Number of most-recent rows that PreprocessData holds out of training and
# predicts on; it also determines the label look-ahead (ceil(0.12 * value)).
# Presumably chosen as roughly one year of trading days -- TODO confirm.
daysToForecast = 251
In [3]:
def getStockData(stock):
    """Download the price history of one ticker from Quandl's WIKI dataset.

    The API key is read from the ``QUANDL_API_KEY`` environment variable
    rather than being embedded in the notebook, so the credential is not
    shared or committed together with the file. (A key that was previously
    hardcoded here should be treated as leaked and rotated.)

    Parameters
    ----------
    stock : str
        Ticker symbol; it is appended to the ``'WIKI/'`` dataset prefix.

    Returns
    -------
    Whatever ``quandl.get`` returns for ``'WIKI/' + stock``; the rest of the
    notebook treats it as a DataFrame with ``'Adj. *'`` columns.
    """
    import os  # local import keeps this fix self-contained

    api_key = os.environ.get("QUANDL_API_KEY")
    if api_key:
        quandl.ApiConfig.api_key = api_key
    # When the variable is unset, whatever key is already configured on
    # quandl.ApiConfig (possibly none) is left untouched.
    return quandl.get('WIKI/'+stock)
In [4]:
def FormatDataForModel(dataArray):
    """Derive the model's feature frame from raw adjusted price data.

    Parameters
    ----------
    dataArray : pandas.DataFrame
        Must contain the columns 'Adj. Open', 'Adj. High', 'Adj. Low',
        'Adj. Close' and 'Adj. Volume'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'Adj. Close', 'HL_PCT', 'PCT_change'
        and 'Adj. Volume' (in that order). Missing values are replaced by
        the sentinel -99999.
    """
    # Take an explicit copy so the derived columns are written to a frame we
    # own instead of to a slice of the caller's data (chained assignment).
    prices = dataArray[['Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume']].copy()
    # Gap between the day's high and its close, as a percentage of the close.
    prices['HL_PCT'] = (prices['Adj. High'] - prices['Adj. Close']) / prices['Adj. Close'] * 100.0
    # Open-to-close move, as a percentage of the open.
    prices['PCT_change'] = (prices['Adj. Close'] - prices['Adj. Open']) / prices['Adj. Open'] * 100.0
    features = prices[['Adj. Close', 'HL_PCT', 'PCT_change', 'Adj. Volume']]
    # Non-in-place fillna returns a fresh frame that is safe to extend later.
    return features.fillna(-99999)
In [5]:
def PreprocessData(mlData,daysToForecast):
    """Split the feature frame into scaled training data and a forecast window.

    The label of each row is the 'Adj. Close' value ``forecast_out`` rows
    later, where ``forecast_out = ceil(0.12 * daysToForecast)``. All feature
    columns are standardised together with ``preprocessing.scale`` and the
    last ``daysToForecast`` rows are held out as the forecast window.

    Parameters
    ----------
    mlData : pandas.DataFrame
        Feature frame containing an 'Adj. Close' column. It is not modified.
    daysToForecast : int
        Number of trailing rows to hold out; must satisfy
        ``1 <= daysToForecast < len(mlData)``.

    Returns
    -------
    list
        ``[X, y, X_data, forecastData]``: scaled training features, training
        labels, scaled features of the held-out rows, and the held-out rows
        themselves (including the 'label' column).

    Raises
    ------
    ValueError
        If ``daysToForecast`` would leave the training set or the forecast
        window empty.
    """
    if daysToForecast < 1 or daysToForecast >= len(mlData):
        raise ValueError(
            "daysToForecast must be between 1 and len(mlData) - 1, got %r"
            % (daysToForecast,)
        )
    forecast_col = 'Adj. Close'
    forecast_out = int(math.ceil(0.12*daysToForecast))
    # Work on a copy so the caller's frame does not silently gain a 'label' column.
    labelled = mlData.copy()
    labelled['label'] = labelled[forecast_col].shift(-forecast_out)
    # `columns=` keyword: passing the axis positionally was removed in pandas 2.0.
    X = preprocessing.scale(np.array(labelled.drop(columns=['label'])))
    X_data = X[-daysToForecast:]
    X = X[:-daysToForecast]
    forecastData = labelled.iloc[-daysToForecast:]
    # forecast_out <= daysToForecast, so none of the training labels are NaN.
    y = np.array(labelled['label'].iloc[:-daysToForecast])
    return [X, y, X_data, forecastData]
In [6]:
def TrainAndPredict(model,X,y,X_data,test_size=0.2,random_state=None):
    """Fit ``model`` on a random split of (X, y), score it, and predict ``X_data``.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``score`` and ``predict``; it is fitted in place.
    X, y : array-like
        Features and labels to split into train and test parts.
    X_data : array-like
        Features to predict on after fitting.
    test_size : float, optional
        Fraction of the rows reserved for scoring (default 0.2, the value
        that was previously hardcoded).
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps
        the original unseeded behaviour; pass an integer for reproducible
        splits and scores.

    Returns
    -------
    tuple
        ``(accuracy, prediction)`` where ``accuracy`` is
        ``model.score(X_test, y_test)`` (for scikit-learn regressors this is
        the R^2 coefficient, not a classification accuracy) and
        ``prediction`` is ``model.predict(X_data)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)
    accuracy = model.score(X_test, y_test)
    prediction = model.predict(X_data)
    return accuracy, prediction
In [7]:
def addPredictionToForecast(prediction,forecastData):
    """Pair the held-out closing prices with the model's predictions.

    Returns a new frame holding only the 'Adj. Close' column of
    ``forecastData`` (renamed to 'EOD') plus a 'prediction' column filled
    from ``prediction``. ``forecastData`` itself is left unchanged.
    """
    eod_with_prediction = forecastData[['Adj. Close']].rename(
        columns={'Adj. Close':'EOD'}
    )
    eod_with_prediction['prediction'] = prediction[:]
    return eod_with_prediction
In [8]:
def GraphPredictions(forecastData,stock):
    """Draw ``forecastData`` as a plotly express line chart titled with ``stock``.

    The x axis is labelled 'Time' and the y axis 'Price'; the figure is
    displayed immediately and nothing is returned.
    """
    layout_options = dict(title=stock, xaxis_title='Time', yaxis_title='Price')
    chart = px.line(forecastData)
    chart.update_layout(**layout_options)
    chart.show()
In [9]:
def GraphAllData(allData,forecastData,stock):
    """Plot the full adjusted-close history alongside the predictions.

    Parameters
    ----------
    allData : pandas.DataFrame
        Raw price data; only its 'Adj. Close' column is used.
    forecastData : pandas.DataFrame
        Frame with a 'prediction' column (as produced by
        ``addPredictionToForecast``).
    stock : str
        Ticker symbol used as the chart title.
    """
    # Column-wise concat aligns the two series on their index.
    result = pd.concat([allData['Adj. Close'],forecastData['prediction']],axis =1, sort=False)
    # Delegate to the shared plotting helper instead of repeating its
    # px.line / update_layout / show sequence here.
    GraphPredictions(result, stock)
In [10]:
# End-to-end run for the single ticker configured in `stock`:
# download -> build features -> split/scale -> fit and predict -> tabulate.
allData = getStockData(stock)
mlData = FormatDataForModel(allData)
X,y,X_data,forecastData = PreprocessData(mlData,daysToForecast)
model = LinearRegression()
accuracy,prediction=TrainAndPredict(model,X,y,X_data)
# From here on forecastData holds only the 'EOD' and 'prediction' columns.
forecastData = addPredictionToForecast(prediction,forecastData)
In [11]:
# Held-out score returned by TrainAndPredict (the value of model.score).
print(accuracy)
0.979746845531913
In [12]:
# Chart the held-out window: 'EOD' next to the model's 'prediction'.
GraphPredictions(forecastData,stock)
In [13]:
# Chart the full 'Adj. Close' history together with the predictions.
GraphAllData(allData,forecastData,stock)
In [14]:
# Tickers to run the LinearRegression pipeline on.
stock_list = ['AAPL', 'IBM', 'MSFT', 'WMT','AMZN','TSLA','HP']

# Repeat the single-stock pipeline for every ticker. The loop body must be
# indented under the `for`; without the indentation the cell does not parse.
# Note that the loop rebinds the module-level `stock`, `allData`, `mlData`,
# `model`, `accuracy`, `prediction` and `forecastData` on every iteration.
for stock in stock_list:
    print("Stock: ", stock)
    allData = getStockData(stock)
    mlData = FormatDataForModel(allData)
    X,y,X_data,forecastData = PreprocessData(mlData,daysToForecast)
    model = LinearRegression()
    accuracy,prediction=TrainAndPredict(model,X,y,X_data)
    print("Accuracy: ", accuracy)
    forecastData = addPredictionToForecast(prediction,forecastData)
    GraphPredictions(forecastData,stock)
    GraphAllData(allData,forecastData,stock)
In [15]:
# Candidate regressors, each paired with the display name used in the result
# tables. Every entry is a two-item list [estimator, name] so that it can be
# unpacked with `for model, name in model_list`.
model_list = [
    [estimator_cls(), label]
    for estimator_cls, label in (
        (LinearRegression, "LinearRegression"),
        (SVR, "SupportVectorRegression"),
        (MLPRegressor, "MLPRegressor"),
        (SGDRegressor, "SGDRegressor"),
        (BayesianRidge, "BayesianRidge"),
        (ARDRegression, "ARDRegression"),
        (PassiveAggressiveRegressor, "PassiveAggressiveRegressor"),
        (TheilSenRegressor, "TheilSenRegressor"),
    )
]
In [16]:
# Compare every model in model_list on every ticker in stock_list.
# model_results collects (model name, ticker, score) tuples;
# stock_dfs collects (ticker, frame of actual close + one column per model).
model_results = []
stock_dfs = []
for stock in stock_list:
    print("Stock: ", stock)
    # Data is downloaded and preprocessed once per ticker, then shared by all models.
    allData = getStockData(stock)
    mlData = FormatDataForModel(allData)
    X,y,X_data,forecastData = PreprocessData(mlData,daysToForecast)
    # Start the per-ticker frame with the held-out closing prices.
    df_stocks = forecastData[['Adj. Close']]
    df_stocks = df_stocks.rename(columns={'Adj. Close':stock+' Actual'})
    for model,name in model_list:
        # The same estimator objects from model_list are re-fitted for each ticker;
        # every call draws its own train/test split inside TrainAndPredict.
        accuracy,prediction=TrainAndPredict(model,X,y,X_data)
        print("Model: ",name , "  ","Accuracy:", accuracy)
        model_results.append((name,stock,accuracy))
        # One prediction column per model, keyed by the model's display name.
        df_stocks[name] = prediction[:]
    stock_dfs.append((stock,df_stocks))
Stock:  AAPL
Model:  LinearRegression    Accuracy: 0.9904995880782382
Model:  SupportVectorRegression    Accuracy: 0.9789756147751746
Model:  MLPRegressor    Accuracy: 0.9899485797886548
Model:  SGDRegressor    Accuracy: 0.9900662274969293
Model:  BayesianRidge    Accuracy: 0.989080929911061
Model:  ARDRegression    Accuracy: 0.9893881781315543
Model:  PassiveAggressiveRegressor    Accuracy: 0.9893424857263831
Model:  TheilSenRegressor    Accuracy: 0.9882218310041382
Stock:  IBM
Model:  LinearRegression    Accuracy: 0.9908450911874079
Model:  SupportVectorRegression    Accuracy: 0.9764618838672751
Model:  MLPRegressor    Accuracy: 0.9902708561694092
Model:  SGDRegressor    Accuracy: 0.9906453563355861
Model:  BayesianRidge    Accuracy: 0.9902000281487762
Model:  ARDRegression    Accuracy: 0.9905951699034512
Model:  PassiveAggressiveRegressor    Accuracy: 0.9830956609128382
Model:  TheilSenRegressor    Accuracy: 0.990233714348951
Stock:  MSFT
Model:  LinearRegression    Accuracy: 0.9814061198940892
Model:  SupportVectorRegression    Accuracy: 0.9691400549905326
Model:  MLPRegressor    Accuracy: 0.9824818573732059
Model:  SGDRegressor    Accuracy: 0.9809581192253725
Model:  BayesianRidge    Accuracy: 0.9807964784786197
Model:  ARDRegression    Accuracy: 0.9822949468568271
Model:  PassiveAggressiveRegressor    Accuracy: 0.955840410658951
Model:  TheilSenRegressor    Accuracy: 0.978938337657457
Stock:  WMT
Model:  LinearRegression    Accuracy: 0.9927962926916254
Model:  SupportVectorRegression    Accuracy: 0.9837611880109405
Model:  MLPRegressor    Accuracy: 0.9918806486116416
Model:  SGDRegressor    Accuracy: 0.9918985092482716
Model:  BayesianRidge    Accuracy: 0.9923197206403848
Model:  ARDRegression    Accuracy: 0.9921295946516301
Model:  PassiveAggressiveRegressor    Accuracy: 0.9840649974202172
Model:  TheilSenRegressor    Accuracy: 0.9923245699111453
Stock:  AMZN
Model:  LinearRegression    Accuracy: 0.986270399377029
Model:  SupportVectorRegression    Accuracy: 0.8151575892108207
Model:  MLPRegressor    Accuracy: 0.9819514923281615
Model:  SGDRegressor    Accuracy: 0.9870915359192041
Model:  BayesianRidge    Accuracy: 0.9876957738696029
Model:  ARDRegression    Accuracy: 0.9861834513380118
Model:  PassiveAggressiveRegressor    Accuracy: 0.9800399576557116
Model:  TheilSenRegressor    Accuracy: 0.9855338839612808
Stock:  TSLA
Model:  LinearRegression    Accuracy: 0.9277358806587819
Model:  SupportVectorRegression    Accuracy: 0.8923549812883709
Model:  MLPRegressor    Accuracy: 0.8489898948815576
Model:  SGDRegressor    Accuracy: 0.9343956014758087
Model:  BayesianRidge    Accuracy: 0.9231792112538451
Model:  ARDRegression    Accuracy: 0.924351071863379
Model:  PassiveAggressiveRegressor    Accuracy: 0.9228850885605718
Model:  TheilSenRegressor    Accuracy: 0.9237628226663703
Stock:  HP
Model:  LinearRegression    Accuracy: 0.9737684444736475
Model:  SupportVectorRegression    Accuracy: 0.9570065489314906
Model:  MLPRegressor    Accuracy: 0.9715993201308318
Model:  SGDRegressor    Accuracy: 0.9682195655806677
Model:  BayesianRidge    Accuracy: 0.9661569874528965
Model:  ARDRegression    Accuracy: 0.9693513260666351
Model:  PassiveAggressiveRegressor    Accuracy: 0.9630717557890922
Model:  TheilSenRegressor    Accuracy: 0.9698925783607588
In [17]:
# Pivot model_results into a (model x stock) table of scores.
model_names = [name for _, name in model_list]
# dtype=float gives a numeric (NaN-filled) frame instead of an object-dtype
# one, so idxmax/max/mean work on it without per-column casts.
df = pd.DataFrame(columns=stock_list, index=model_names, dtype=float)
# Each result is a (model name, ticker, score) tuple; if a pair occurs more
# than once, the last score wins.
for result_model, result_stock, result_score in model_results:
    df.at[result_model, result_stock] = result_score
df
Out[17]:
AAPL IBM MSFT WMT AMZN TSLA HP
LinearRegression 0.9905 0.990845 0.981406 0.992796 0.98627 0.927736 0.973768
SupportVectorRegression 0.978976 0.976462 0.96914 0.983761 0.815158 0.892355 0.957007
MLPRegressor 0.989949 0.990271 0.982482 0.991881 0.981951 0.84899 0.971599
SGDRegressor 0.990066 0.990645 0.980958 0.991899 0.987092 0.934396 0.96822
BayesianRidge 0.989081 0.9902 0.980796 0.99232 0.987696 0.923179 0.966157
ARDRegression 0.989388 0.990595 0.982295 0.99213 0.986183 0.924351 0.969351
PassiveAggressiveRegressor 0.989342 0.983096 0.95584 0.984065 0.98004 0.922885 0.963072
TheilSenRegressor 0.988222 0.990234 0.978938 0.992325 0.985534 0.923763 0.969893
In [18]:
# For each stock column of the score table, record which model scored
# highest and what that score was.
highest = [
    [ticker, df[ticker].astype(float).idxmax(), df[ticker].max()]
    for ticker in df.columns
]
df_high = pd.DataFrame(highest, columns=["Stock","Model","Accuracy"])
df_high
Out[18]:
Stock Model Accuracy
0 AAPL LinearRegression 0.990500
1 IBM LinearRegression 0.990845
2 MSFT MLPRegressor 0.982482
3 WMT LinearRegression 0.992796
4 AMZN BayesianRidge 0.987696
5 TSLA SGDRegressor 0.934396
6 HP LinearRegression 0.973768
In [19]:
# Mean score of each model across all stocks (row-wise mean of the score table).
average = df.mean(axis=1)
In [20]:
# Rank the models from highest to lowest mean score.
average.sort_values(ascending=False)
Out[20]:
LinearRegression              0.977617
SGDRegressor                  0.977611
ARDRegression                 0.976328
BayesianRidge                 0.975633
TheilSenRegressor             0.975558
PassiveAggressiveRegressor    0.968334
MLPRegressor                  0.965303
SupportVectorRegression       0.938980
dtype: float64
In [21]:
# One chart per ticker: the actual close next to every model's prediction.
# GraphPredictions already performs the px.line / update_layout / show
# sequence with the same title and axis labels, so reuse it here.
for stock,stock_df in stock_dfs:
    GraphPredictions(stock_df, stock)